{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import jieba\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from tqdm import tqdm\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.naive_bayes import MultinomialNB\n",
    "from sklearn import metrics\n",
    "from sklearn.naive_bayes import GaussianNB\n",
    "from sklearn.naive_bayes import BernoulliNB"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "outputs": [],
   "source": [
    "import os,shutil\n",
    "def mycopyfile(srcfile,dstfile):\n",
    "    if not os.path.isfile(srcfile):\n",
    "        print(\"dst not exist!\")\n",
    "    else:\n",
    "        fpath,fname=os.path.split(dstfile)    #分离文件名和路径\n",
    "        if not os.path.exists(fpath):\n",
    "            os.makedirs(fpath)                #创建路径\n",
    "        shutil.copyfile(srcfile,dstfile)      #复制文件\n",
    "# src_path=r\"D:\\Download\\kagglecatsanddogs_3367a\\PetImages\\1.jpg\"\n",
    "# dst_path=r\"D:\\Download\\kagglecatsanddogs_3367a\\1.jpg\"\n",
    "# mycopyfile(src_path,dst_path)"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "outputs": [],
   "source": [
    "# 划分数据集\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "corpus_root_path = 'D:/大三下/NLP_Foundation/datasets/thucnews'\n",
    "target_root_path = '../../docs/categrogy/'\n",
    "filelist = os.listdir(corpus_root_path)\n",
    "\n",
    "# For every category folder, split its file names 80/20 (fixed seed) and copy the\n",
    "# files into <target>/train/<category>/ and <target>/test/<category>/.\n",
    "for category in filelist:\n",
    "    corpus_file_path = os.path.join(corpus_root_path, category)\n",
    "    fnames = os.listdir(corpus_file_path)\n",
    "    ftrain_names, ftest_names = train_test_split(fnames, test_size=0.2, random_state=41)\n",
    "    # Train files are copied first, then test files, with the same copy logic.\n",
    "    for split_dir, split_names in (('train/', ftrain_names), ('test/', ftest_names)):\n",
    "        for fname in split_names:\n",
    "            src_path = os.path.join(corpus_file_path, fname)\n",
    "            dst_path = os.path.join(target_root_path + split_dir + category, fname)\n",
    "            mycopyfile(src_path, dst_path)"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "outputs": [],
   "source": [
    "def loadfile(filepath, category):\n",
    "    '''Read every file in ``filepath`` and return its segmented text with labels.\n",
    "\n",
    "    Each file is decoded as UTF-8 and segmented with ``jieba.cut``; the tokens are\n",
    "    joined with single spaces so that a word-boundary based vectorizer can\n",
    "    recover the individual words.\n",
    "\n",
    "    Args:\n",
    "        filepath: directory holding the documents of one category.\n",
    "        category: label attached to every document read from ``filepath``.\n",
    "\n",
    "    Returns:\n",
    "        (content, label): two parallel lists - the space-separated token strings\n",
    "        and one copy of ``category`` per document.\n",
    "    '''\n",
    "    content = []\n",
    "    label = []\n",
    "\n",
    "    for file in os.listdir(filepath):\n",
    "        with open(os.path.join(filepath, file), encoding='utf-8') as f:\n",
    "            # Join with a space: joining with '' would glue the tokens back into the\n",
    "            # original unsegmented text and throw the segmentation away.\n",
    "            content.append(\" \".join(jieba.cut(f.read())))\n",
    "            label.append(category)\n",
    "    return content, label"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 10/10 [10:32<00:00, 63.23s/it]\n",
      "100%|██████████| 10/10 [02:53<00:00, 17.34s/it]\n"
     ]
    }
   ],
   "source": [
    "target_root_path = '../../docs/categrogy/'\n",
    "train_dir = os.path.join(target_root_path, 'train')\n",
    "test_dir = os.path.join(target_root_path, 'test')\n",
    "textdir = [train_dir, test_dir]\n",
    "traincontent, trainlabel = [], []\n",
    "testcontent, testlabel = [], []\n",
    "\n",
    "for text in textdir:\n",
    "    textlist = os.listdir(text)  # one sub-folder per category\n",
    "    # Documents found under train/ feed the train lists, the others the test lists.\n",
    "    if text == train_dir:\n",
    "        contents, labels = traincontent, trainlabel\n",
    "    else:\n",
    "        contents, labels = testcontent, testlabel\n",
    "    for category in tqdm(textlist):\n",
    "        content, label = loadfile(text + \"/\" + category, category)\n",
    "        contents.extend(content)\n",
    "        labels.extend(label)"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "outputs": [
    {
     "data": {
      "text/plain": "['$',\n '0',\n '1',\n '2',\n '3',\n '4',\n '5',\n '6',\n '7',\n '8',\n '9',\n '?',\n '_',\n '“',\n '”',\n '、',\n '。',\n '《',\n '》',\n '一',\n '一些',\n '一何',\n '一切',\n '一则',\n '一方面',\n '一旦',\n '一来',\n '一样',\n '一般',\n '一转眼',\n '万一',\n '上',\n '上下',\n '下',\n '不',\n '不仅',\n '不但',\n '不光',\n '不单',\n '不只',\n '不外乎',\n '不如',\n '不妨',\n '不尽',\n '不尽然',\n '不得',\n '不怕',\n '不惟',\n '不成',\n '不拘',\n '不料',\n '不是',\n '不比',\n '不然',\n '不特',\n '不独',\n '不管',\n '不至于',\n '不若',\n '不论',\n '不过',\n '不问',\n '与',\n '与其',\n '与其说',\n '与否',\n '与此同时',\n '且',\n '且不说',\n '且说',\n '两者',\n '个',\n '个别',\n '临',\n '为',\n '为了',\n '为什么',\n '为何',\n '为止',\n '为此',\n '为着',\n '乃',\n '乃至',\n '乃至于',\n '么',\n '之',\n '之一',\n '之所以',\n '之类',\n '乌乎',\n '乎',\n '乘',\n '也',\n '也好',\n '也罢',\n '了',\n '二来',\n '于',\n '于是',\n '于是乎',\n '云云',\n '云尔',\n '些',\n '亦',\n '人',\n '人们',\n '人家',\n '什么',\n '什么样',\n '今',\n '介于',\n '仍',\n '仍旧',\n '从',\n '从此',\n '从而',\n '他',\n '他人',\n '他们',\n '以',\n '以上',\n '以为',\n '以便',\n '以免',\n '以及',\n '以故',\n '以期',\n '以来',\n '以至',\n '以至于',\n '以致',\n '们',\n '任',\n '任何',\n '任凭',\n '似的',\n '但',\n '但凡',\n '但是',\n '何',\n '何以',\n '何况',\n '何处',\n '何时',\n '余外',\n '作为',\n '你',\n '你们',\n '使',\n '使得',\n '例如',\n '依',\n '依据',\n '依照',\n '便于',\n '俺',\n '俺们',\n '倘',\n '倘使',\n '倘或',\n '倘然',\n '倘若',\n '借',\n '假使',\n '假如',\n '假若',\n '傥然',\n '像',\n '儿',\n '先不先',\n '光是',\n '全体',\n '全部',\n '兮',\n '关于',\n '其',\n '其一',\n '其中',\n '其二',\n '其他',\n '其余',\n '其它',\n '其次',\n '具体地说',\n '具体说来',\n '兼之',\n '内',\n '再',\n '再其次',\n '再则',\n '再有',\n '再者',\n '再者说',\n '再说',\n '冒',\n '冲',\n '况且',\n '几',\n '几时',\n '凡',\n '凡是',\n '凭',\n '凭借',\n '出于',\n '出来',\n '分别',\n '则',\n '则甚',\n '别',\n '别人',\n '别处',\n '别是',\n '别的',\n '别管',\n '别说',\n '到',\n '前后',\n '前此',\n '前者',\n '加之',\n '加以',\n '即',\n '即令',\n '即使',\n '即便',\n '即如',\n '即或',\n '即若',\n '却',\n '去',\n '又',\n '又及',\n '及',\n '及其',\n '及至',\n '反之',\n '反而',\n '反过来',\n '反过来说',\n '受到',\n '另',\n '另一方面',\n '另外',\n '另悉',\n '只',\n '只当',\n '只怕',\n '只是',\n '只有',\n '只消',\n '只要',\n '只限',\n '叫',\n 
'叮咚',\n '可',\n '可以',\n '可是',\n '可见',\n '各',\n '各个',\n '各位',\n '各种',\n '各自',\n '同',\n '同时',\n '后',\n '后者',\n '向',\n '向使',\n '向着',\n '吓',\n '吗',\n '否则',\n '吧',\n '吧哒',\n '吱',\n '呀',\n '呃',\n '呕',\n '呗',\n '呜',\n '呜呼',\n '呢',\n '呵',\n '呵呵',\n '呸',\n '呼哧',\n '咋',\n '和',\n '咚',\n '咦',\n '咧',\n '咱',\n '咱们',\n '咳',\n '哇',\n '哈',\n '哈哈',\n '哉',\n '哎',\n '哎呀',\n '哎哟',\n '哗',\n '哟',\n '哦',\n '哩',\n '哪',\n '哪个',\n '哪些',\n '哪儿',\n '哪天',\n '哪年',\n '哪怕',\n '哪样',\n '哪边',\n '哪里',\n '哼',\n '哼唷',\n '唉',\n '唯有',\n '啊',\n '啐',\n '啥',\n '啦',\n '啪达',\n '啷当',\n '喂',\n '喏',\n '喔唷',\n '喽',\n '嗡',\n '嗡嗡',\n '嗬',\n '嗯',\n '嗳',\n '嘎',\n '嘎登',\n '嘘',\n '嘛',\n '嘻',\n '嘿',\n '嘿嘿',\n '因',\n '因为',\n '因了',\n '因此',\n '因着',\n '因而',\n '固然',\n '在',\n '在下',\n '在于',\n '地',\n '基于',\n '处在',\n '多',\n '多么',\n '多少',\n '大',\n '大家',\n '她',\n '她们',\n '好',\n '如',\n '如上',\n '如上所述',\n '如下',\n '如何',\n '如其',\n '如同',\n '如是',\n '如果',\n '如此',\n '如若',\n '始而',\n '孰料',\n '孰知',\n '宁',\n '宁可',\n '宁愿',\n '宁肯',\n '它',\n '它们',\n '对',\n '对于',\n '对待',\n '对方',\n '对比',\n '将',\n '小',\n '尔',\n '尔后',\n '尔尔',\n '尚且',\n '就',\n '就是',\n '就是了',\n '就是说',\n '就算',\n '就要',\n '尽',\n '尽管',\n '尽管如此',\n '岂但',\n '己',\n '已',\n '已矣',\n '巴',\n '巴巴',\n '并',\n '并且',\n '并非',\n '庶乎',\n '庶几',\n '开外',\n '开始',\n '归',\n '归齐',\n '当',\n '当地',\n '当然',\n '当着',\n '彼',\n '彼时',\n '彼此',\n '往',\n '待',\n '很',\n '得',\n '得了',\n '怎',\n '怎么',\n '怎么办',\n '怎么样',\n '怎奈',\n '怎样',\n '总之',\n '总的来看',\n '总的来说',\n '总的说来',\n '总而言之',\n '恰恰相反',\n '您',\n '惟其',\n '慢说',\n '我',\n '我们',\n '或',\n '或则',\n '或是',\n '或曰',\n '或者',\n '截至',\n '所',\n '所以',\n '所在',\n '所幸',\n '所有',\n '才',\n '才能',\n '打',\n '打从',\n '把',\n '抑或',\n '拿',\n '按',\n '按照',\n '换句话说',\n '换言之',\n '据',\n '据此',\n '接着',\n '故',\n '故此',\n '故而',\n '旁人',\n '无',\n '无宁',\n '无论',\n '既',\n '既往',\n '既是',\n '既然',\n '时候',\n '是',\n '是以',\n '是的',\n '曾',\n '替',\n '替代',\n '最',\n '有',\n '有些',\n '有关',\n '有及',\n '有时',\n '有的',\n '望',\n '朝',\n '朝着',\n '本',\n '本人',\n '本地',\n '本着',\n '本身',\n '来',\n '来着',\n '来自',\n '来说',\n '极了',\n '果然',\n '果真',\n '某',\n 
'某个',\n '某些',\n '某某',\n '根据',\n '欤',\n '正值',\n '正如',\n '正巧',\n '正是',\n '此',\n '此地',\n '此处',\n '此外',\n '此时',\n '此次',\n '此间',\n '毋宁',\n '每',\n '每当',\n '比',\n '比及',\n '比如',\n '比方',\n '没奈何',\n '沿',\n '沿着',\n '漫说',\n '焉',\n '然则',\n '然后',\n '然而',\n '照',\n '照着',\n '犹且',\n '犹自',\n '甚且',\n '甚么',\n '甚或',\n '甚而',\n '甚至',\n '甚至于',\n '用',\n '用来',\n '由',\n '由于',\n '由是',\n '由此',\n '由此可见',\n '的',\n '的确',\n '的话',\n '直到',\n '相对而言',\n '省得',\n '看',\n '眨眼',\n '着',\n '着呢',\n '矣',\n '矣乎',\n '矣哉',\n '离',\n '竟而',\n '第',\n '等',\n '等到',\n '等等',\n '简言之',\n '管',\n '类如',\n '紧接着',\n '纵',\n '纵令',\n '纵使',\n '纵然',\n '经',\n '经过',\n '结果',\n '给',\n '继之',\n '继后',\n '继而',\n '综上所述',\n '罢了',\n '者',\n '而',\n '而且',\n '而况',\n '而后',\n '而外',\n '而已',\n '而是',\n '而言',\n '能',\n '能否',\n '腾',\n '自',\n '自个儿',\n '自从',\n '自各儿',\n '自后',\n '自家',\n '自己',\n '自打',\n '自身',\n '至',\n '至于',\n '至今',\n '至若',\n '致',\n '般的',\n '若',\n '若夫',\n '若是',\n '若果 ',\n '若非',\n '莫不然',\n '莫如',\n '莫若',\n '虽',\n '虽则',\n '虽然',\n '虽说',\n '被',\n '要',\n '要不',\n '要不是',\n '要不然',\n '要么',\n '要是',\n '譬喻',\n '譬如',\n '让',\n '许多',\n '论',\n '设使',\n '设或',\n '设若',\n '诚如',\n '诚然',\n '该',\n '说来',\n '诸',\n '诸位',\n '诸如',\n '谁',\n '谁人',\n '谁料',\n '谁知',\n '贼死',\n '赖以',\n '赶',\n '起',\n '起见',\n '趁',\n '趁着',\n '越是',\n '距',\n '跟',\n '较',\n '较之',\n '边',\n '过',\n '还',\n '还是',\n '还有',\n '还要',\n '这',\n '这一来',\n '这个',\n '这么',\n '这么些',\n '这么样',\n '这么点儿',\n '这些',\n '这会儿',\n '这儿',\n '这就是说',\n '这时',\n '这样',\n '这次',\n '这般',\n '这边',\n '这里',\n '进而',\n '连',\n '连同',\n '逐步',\n '通过',\n '遵循',\n '遵照',\n '那',\n '那个',\n '那么',\n '那么些',\n '那么样',\n '那些',\n '那会儿',\n '那儿',\n '那时',\n '那样',\n '那般',\n '那边',\n '那里',\n '都',\n '鄙人',\n '鉴于',\n '针对',\n '阿',\n '除',\n '除了',\n '除外',\n '除开',\n '除此之外',\n '除非',\n '随',\n '随后',\n '随时',\n '随着',\n '难道说',\n '非但',\n '非徒',\n '非特',\n '非独',\n '靠',\n '顺',\n '顺着',\n '首先',\n '！',\n '，',\n '：',\n '；',\n '？',\n '']"
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "stopwords_path = '../../docs/categrogy/stopwords/cn_stopwords.txt'\n",
    "# The file holds one stop word per line. Strip surrounding whitespace and skip blank\n",
    "# lines: an entry with a trailing space can never match a token, and splitting on\n",
    "# '\\n' would also leave an empty-string entry for the final newline.\n",
    "with open(stopwords_path, encoding='utf-8') as file:\n",
    "    stopwords = [line.strip() for line in file if line.strip()]\n",
    "stopwords"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "outputs": [],
   "source": [
    "# Learn the vocabulary and IDF weights on the training documents only, then reuse\n",
    "# them for the test documents so both matrices share the same feature columns.\n",
    "# max_df=0.5 ignores terms that occur in more than half of the training documents.\n",
    "# NOTE(review): the default token_pattern keeps only tokens of 2+ word characters,\n",
    "# so single-character words are dropped - confirm this is intended for Chinese text.\n",
    "tfidf = TfidfVectorizer(max_df=0.5, stop_words=stopwords)\n",
    "traindata = tfidf.fit_transform(traincontent)\n",
    "testdata = tfidf.transform(testcontent)"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "多项式朴素贝叶斯文本分类的准确率为： 0.8836153846153846\n"
     ]
    }
   ],
   "source": [
    "# Multinomial naive Bayes on the TF-IDF features\n",
    "nb_model = MultinomialNB(alpha=0.001)\n",
    "nb_model.fit(traindata,trainlabel)\n",
    "predict_test = nb_model.predict(testdata)\n",
    "# accuracy_score expects (y_true, y_pred): ground-truth labels go first.\n",
    "print(\"多项式朴素贝叶斯文本分类的准确率为：\",metrics.accuracy_score(testlabel,predict_test))"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "bernoulli贝叶斯文本分类的准确率为： 0.7926923076923077\n"
     ]
    }
   ],
   "source": [
    "#bernoulli朴素贝叶斯\n",
    "from sklearn.naive_bayes import BernoulliNB\n",
    "ber_model = BernoulliNB(alpha=0.001)\n",
    "ber_model.fit(traindata,trainlabel)\n",
    "ber_predict = ber_model.predict(testdata)\n",
    "# accuracy_score expects (y_true, y_pred): ground-truth labels go first.\n",
    "print(\"bernoulli贝叶斯文本分类的准确率为：\",metrics.accuracy_score(testlabel,ber_predict))"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "outputs": [
    {
     "ename": "MemoryError",
     "evalue": "Unable to allocate 1.17 TiB for an array with shape (52000, 3101172) and data type float64",
     "output_type": "error",
     "traceback": [
      "\u001B[1;31m---------------------------------------------------------------------------\u001B[0m",
      "\u001B[1;31mMemoryError\u001B[0m                               Traceback (most recent call last)",
      "\u001B[1;32m<ipython-input-44-47bc4687a2e9>\u001B[0m in \u001B[0;36m<module>\u001B[1;34m\u001B[0m\n\u001B[0;32m      1\u001B[0m \u001B[1;31m#高斯贝叶斯分类器\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m      2\u001B[0m \u001B[0mgauss_model\u001B[0m \u001B[1;33m=\u001B[0m \u001B[0mGaussianNB\u001B[0m\u001B[1;33m(\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[1;32m----> 3\u001B[1;33m \u001B[0mgauss_model\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mfit\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0mtraindata\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mtoarray\u001B[0m\u001B[1;33m(\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m,\u001B[0m\u001B[0mtrainlabel\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0m\u001B[0;32m      4\u001B[0m \u001B[0mgauss_predict\u001B[0m \u001B[1;33m=\u001B[0m \u001B[0mber_model\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mpredict\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0mtestdata\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mtoarray\u001B[0m\u001B[1;33m(\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m      5\u001B[0m \u001B[0mprint\u001B[0m\u001B[1;33m(\u001B[0m\u001B[1;34m\"GaussianNB贝叶斯文本分类的准确率为：\"\u001B[0m\u001B[1;33m,\u001B[0m\u001B[0mmetrics\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0maccuracy_score\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0mgauss_predict\u001B[0m\u001B[1;33m,\u001B[0m\u001B[0mtestlabel\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n",
      "\u001B[1;32mD:\\Documents\\anaconda3\\envs\\ml\\scipy\\sparse\\compressed.py\u001B[0m in \u001B[0;36mtoarray\u001B[1;34m(self, order, out)\u001B[0m\n\u001B[0;32m   1037\u001B[0m         \u001B[1;32mif\u001B[0m \u001B[0mout\u001B[0m \u001B[1;32mis\u001B[0m \u001B[1;32mNone\u001B[0m \u001B[1;32mand\u001B[0m \u001B[0morder\u001B[0m \u001B[1;32mis\u001B[0m \u001B[1;32mNone\u001B[0m\u001B[1;33m:\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m   1038\u001B[0m             \u001B[0morder\u001B[0m \u001B[1;33m=\u001B[0m \u001B[0mself\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0m_swap\u001B[0m\u001B[1;33m(\u001B[0m\u001B[1;34m'cf'\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m[\u001B[0m\u001B[1;36m0\u001B[0m\u001B[1;33m]\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[1;32m-> 1039\u001B[1;33m         \u001B[0mout\u001B[0m \u001B[1;33m=\u001B[0m \u001B[0mself\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0m_process_toarray_args\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0morder\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0mout\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0m\u001B[0;32m   1040\u001B[0m         \u001B[1;32mif\u001B[0m \u001B[1;32mnot\u001B[0m \u001B[1;33m(\u001B[0m\u001B[0mout\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mflags\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mc_contiguous\u001B[0m \u001B[1;32mor\u001B[0m \u001B[0mout\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mflags\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mf_contiguous\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m:\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m   1041\u001B[0m             \u001B[1;32mraise\u001B[0m \u001B[0mValueError\u001B[0m\u001B[1;33m(\u001B[0m\u001B[1;34m'Output array must be C or F contiguous'\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n",
      "\u001B[1;32mD:\\Documents\\anaconda3\\envs\\ml\\scipy\\sparse\\base.py\u001B[0m in \u001B[0;36m_process_toarray_args\u001B[1;34m(self, order, out)\u001B[0m\n\u001B[0;32m   1200\u001B[0m             \u001B[1;32mreturn\u001B[0m \u001B[0mout\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m   1201\u001B[0m         \u001B[1;32melse\u001B[0m\u001B[1;33m:\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[1;32m-> 1202\u001B[1;33m             \u001B[1;32mreturn\u001B[0m \u001B[0mnp\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mzeros\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0mself\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mshape\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0mdtype\u001B[0m\u001B[1;33m=\u001B[0m\u001B[0mself\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mdtype\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0morder\u001B[0m\u001B[1;33m=\u001B[0m\u001B[0morder\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0m\u001B[0;32m   1203\u001B[0m \u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m   1204\u001B[0m \u001B[1;33m\u001B[0m\u001B[0m\n",
      "\u001B[1;31mMemoryError\u001B[0m: Unable to allocate 1.17 TiB for an array with shape (52000, 3101172) and data type float64"
     ]
    }
   ],
   "source": [
    "# Cannot run: converting the scipy sparse matrix to a dense array runs out of memory.\n",
    "# Unable to allocate 1.17 TiB for an array with shape (52000, 3101172) and data type float64\n",
    "# Gaussian naive Bayes classifier (kept commented out for reference only)\n",
    "# gauss_model = GaussianNB()\n",
    "# gauss_model.fit(traindata.toarray(),trainlabel)\n",
    "# gauss_predict = gauss_model.predict(testdata.toarray())\n",
    "# print(\"GaussianNB贝叶斯文本分类的准确率为：\",metrics.accuracy_score(gauss_predict,testlabel))"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "outputs": [],
   "source": [
    "result_root_path = '../../docs/categrogy/result/'\n",
    "# np.save does not create missing directories, so make sure the target folder exists.\n",
    "os.makedirs(result_root_path, exist_ok=True)\n",
    "# NOTE(review): traindata/testdata are scipy sparse matrices, so np.save pickles them\n",
    "# into 0-d object arrays that need allow_pickle=True to load; scipy.sparse.save_npz\n",
    "# would be the native format - TODO consider switching save and load together.\n",
    "np.save(result_root_path + 'traindata', traindata)\n",
    "np.save(result_root_path + 'trainlabel', trainlabel)\n",
    "np.save(result_root_path + 'testdata', testdata)\n",
    "np.save(result_root_path + 'testlabel', testlabel)"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "outputs": [
    {
     "data": {
      "text/plain": "array(<52000x3101172 sparse matrix of type '<class 'numpy.float64'>'\n\twith 3981164 stored elements in Compressed Sparse Row format>,\n      dtype=object)"
     },
     "execution_count": 63,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# The sparse matrix was pickled by np.save into a 0-d object array, so allow_pickle=True\n",
    "# is required. Only load .npy files you created yourself: unpickling can execute code.\n",
    "# .item() unwraps the 0-d array and returns the stored sparse matrix itself.\n",
    "np.load(result_root_path + 'traindata' + '.npy', allow_pickle=True).item()"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "outputs": [
    {
     "data": {
      "text/plain": "<52000x3101172 sparse matrix of type '<class 'numpy.float64'>'\n\twith 3981164 stored elements in Compressed Sparse Row format>"
     },
     "execution_count": 58,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show the repr of the in-memory TF-IDF training matrix built by tfidf.fit_transform.\n",
    "traindata"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "outputs": [],
   "source": [
    "from sklearn import svm\n",
    "\n",
    "# Toy problem: four one-dimensional samples, each with its own class label.\n",
    "X = [[value] for value in range(4)]\n",
    "Y = list(range(4))\n",
    "# Fit an SVC configured with the one-vs-one decision function shape.\n",
    "clf = svm.SVC(decision_function_shape='ovo').fit(X, Y)\n",
    "# Predict the classes of two of the training points.\n",
    "pred = clf.predict([[1], [0]])"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "outputs": [
    {
     "data": {
      "text/plain": "{1, 3}"
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "a = set([1, 2, 3])\n",
    "a.add(5)\n",
    "a\n",
    "c = set()\n",
    "c.add(3)\n",
    "c.add(1)\n",
    "c"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "outputs": [
    {
     "data": {
      "text/plain": "[1, 2, 3, 5, 1, 3]"
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Concatenate the elements of a and then c into one list (duplicates are kept).\n",
    "b = [*a, *c]\n",
    "b"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  }
 ],
 "metadata": {
  "kernelspec": {
   "name": "conda-env-ml-py",
   "language": "python",
   "display_name": "Python [conda env:ml] *"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}